Merge pull request #950 from cantino/web_content_charset

Implement charset handling in WebRequestConcern

Akinori MUSHA 9 years ago
parent
commit
d14027c77d
3 changed files with 65 additions and 21 deletions
  1. 63 1
      app/concerns/web_request_concern.rb
  2. 1 0
      app/models/agents/rss_agent.rb
  3. 1 20
      app/models/agents/website_agent.rb

+ 63 - 1
app/concerns/web_request_concern.rb

@@ -14,6 +14,46 @@ module WebRequestConcern
14 14
     end
15 15
   end
16 16
 
17
+  class CharacterEncoding < Faraday::Middleware
18
+    def initialize(app, force_encoding: nil, default_encoding: nil, unzip: nil)
19
+      super(app)
20
+      @force_encoding   = force_encoding
21
+      @default_encoding = default_encoding
22
+      @unzip            = unzip
23
+    end
24
+
25
+    def call(env)
26
+      @app.call(env).on_complete do |env|
27
+        body = env[:body]
28
+
29
+        case @unzip
30
+        when 'gzip'.freeze
31
+          body.replace(ActiveSupport::Gzip.decompress(body))
32
+        end
33
+
34
+        case
35
+        when @force_encoding
36
+          encoding = @force_encoding
37
+        when body.encoding == Encoding::ASCII_8BIT
38
+          # Not all Faraday adapters support automatic charset
39
+          # detection, so we do that.
40
+          case env[:response_headers][:content_type]
41
+          when /;\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i
42
+            encoding = Encoding.find($1) rescue nil
43
+          when /\A\s*(?:text\/[^\s;]+|application\/(?:[^\s;]+\+)?(?:xml|json))\s*(?:;|\z)/i
44
+            encoding = @default_encoding
45
+          else
46
+            # Never try to transcode a binary content
47
+            return
48
+          end
49
+        end
50
+        body.encode!(Encoding::UTF_8, encoding) unless body.encoding == Encoding::UTF_8
51
+      end
52
+    end
53
+  end
54
+
55
+  Faraday::Response.register_middleware character_encoding: CharacterEncoding
56
+
17 57
   extend ActiveSupport::Concern
18 58
 
19 59
   def validate_web_request_options!
@@ -34,6 +74,23 @@ module WebRequestConcern
34 74
     rescue ArgumentError => e
35 75
       errors.add(:base, e.message)
36 76
     end
77
+
78
+    if (encoding = options['force_encoding']).present?
79
+      case encoding
80
+      when String
81
+        begin
82
+          Encoding.find(encoding)
83
+        rescue ArgumentError
84
+          errors.add(:base, "Unknown encoding: #{encoding.inspect}")
85
+        end
86
+      else
87
+        errors.add(:base, "force_encoding must be a string")
88
+      end
89
+    end
90
+  end
91
+
92
+  def default_encoding
93
+    Encoding::UTF_8
37 94
   end
38 95
 
39 96
   def faraday
@@ -44,6 +101,11 @@ module WebRequestConcern
44 101
     }
45 102
 
46 103
     @faraday ||= Faraday.new(faraday_options) { |builder|
104
+      builder.response :character_encoding,
105
+                       force_encoding: interpolated['force_encoding'].presence,
106
+                       default_encoding: default_encoding,
107
+                       unzip: interpolated['unzip'].presence
108
+
47 109
       builder.headers = headers if headers.length > 0
48 110
 
49 111
       builder.headers[:user_agent] = user_agent
@@ -51,7 +113,7 @@ module WebRequestConcern
51 113
       builder.use FaradayMiddleware::FollowRedirects
52 114
       builder.request :url_encoded
53 115
 
54
-      if boolify(options['disable_url_encoding'])
116
+      if boolify(interpolated['disable_url_encoding'])
55 117
         builder.options.params_encoder = DoNotEncoder
56 118
       end
57 119
 

+ 1 - 0
app/models/agents/rss_agent.rb

@@ -29,6 +29,7 @@ module Agents
29 29
           * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
30 30
           * `disable_ssl_verification` - Set to `true` to disable ssl verification.
31 31
           * `disable_url_encoding` - Set to `true` to disable url encoding.
32
+          * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header.  Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
32 33
           * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
33 34
           * `max_events_per_run` - Limit number of events created (items parsed) per run for feed.
34 35
 

+ 1 - 20
app/models/agents/website_agent.rb

@@ -87,7 +87,7 @@ module Agents
87 87
 
88 88
       Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance).  This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
89 89
 
90
-      Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.
90
+      Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header.  Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
91 91
 
92 92
       Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`).
93 93
 
@@ -157,19 +157,6 @@ module Agents
157 157
         errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
158 158
       end
159 159
 
160
-      if (encoding = options['force_encoding']).present?
161
-        case encoding
162
-        when String
163
-          begin
164
-            Encoding.find(encoding)
165
-          rescue ArgumentError
166
-            errors.add(:base, "Unknown encoding: #{encoding.inspect}")
167
-          end
168
-        else
169
-          errors.add(:base, "force_encoding must be a string")
170
-        end
171
-      end
172
-
173 160
       validate_web_request_options!
174 161
     end
175 162
 
@@ -284,12 +271,6 @@ module Agents
284 271
       interpolation_context.stack {
285 272
         interpolation_context['_response_'] = ResponseDrop.new(response)
286 273
         body = response.body
287
-        if (encoding = interpolated['force_encoding']).present?
288
-          body = body.encode(Encoding::UTF_8, encoding)
289
-        end
290
-        if interpolated['unzip'] == "gzip"
291
-          body = ActiveSupport::Gzip.decompress(body)
292
-        end
293 274
         doc = parse(body)
294 275
 
295 276
         if extract_full_json?